Statistical Learning (Machine Learning),
Linear Regression

Module 06

Ray J. Hoobler

Libraries

Code
library(tidyverse)  
library(ggthemes)
library(patchwork)
library(broom)
library(ggtext)
library(latex2exp)

Statistical Learning

What is Statistical Learning? (1/9)

Statistical learning refers to a vast set of tools for understanding data. These tools can be classified as supervised or unsupervised. Broadly speaking, supervised statistical learning involves building a statistical model for predicting, or estimating, an output based on one or more inputs.
- Introduction, ISLR

What is Statistical Learning? (2/9)

\[ Y = f(X_1, X_2, \ldots, X_p) + \epsilon \]

\(f\) is a fixed but unknown function of \(X_1, X_2, \ldots, X_p\), and \(\epsilon\) is a random error term, which is independent of \(X\) and has a mean of zero.

We want to predict a value Y based on the values of X assuming a functional relationship between Y and X.

Code
# Advertising data: Sales in thousands of units,
# media budgets in thousands of dollars.
# Column 1 is a row index, so keep only columns 2-5.
advertising <- read_csv(
  "datasets/Advertising.csv",
  col_names = TRUE,
  col_select = c(2:5),
  show_col_types = FALSE
)
Code
# Helper: scatter of Sales vs one advertising channel with a least-squares
# line. Factoring this out removes the triplicated plot code below.
# `var` is the column name as a string; `.data[[var]]` selects it safely.
plot_sales_vs <- function(var) {
  advertising |>
    ggplot(aes(x = .data[[var]], y = Sales)) +
    geom_point(color = "red", shape = "circle open", stroke = 1) +
    geom_smooth(method = "lm", color = "blue", se = FALSE, linewidth = 2) +
    labs(x = var, y = "Sales") +
    scale_y_continuous(breaks = seq(0, 30, 5)) +
    theme_light()
}

p1 <- plot_sales_vs("TV")
p2 <- plot_sales_vs("Radio")
p3 <- plot_sales_vs("Newspaper")

# patchwork: three panels side by side
p1 + p2 + p3

What is Statistical Learning? (3/9)

Code
# Income1: drop the row-number column, keep Education and Income (col_types "dd")
income_1 <- read_csv("datasets/Income1.csv", col_names = TRUE, col_select = c(2:3), col_types = "dd")

# income_1

# Compare a straight-line fit (blue) with a loess smoother (orange)
income_1 |> 
  ggplot(aes(x = Education, y = Income)) +
  geom_point(color = "red", shape = "circle open", stroke = 1) +
  geom_smooth(method = "lm", color = "blue", se = FALSE) +
  geom_smooth(method = "loess", color = "orange", se = FALSE) +
  labs(x = "Years of Education", y = "Income") +
  scale_y_continuous(breaks = seq(0, 100, 10)) +
  scale_x_continuous(limits = c(10, 22), breaks = seq(10, 22, 2)) +
  theme_light()

\[ \textbf{Income} = \beta_0 + \beta_1 \times \textbf{Education} \\ \, \\ \textbf{Income} = \frac{a}{1 + \exp(-b(\textbf{Education} - c))} + d \]


How do we choose our model?

What is Statistical Learning? (4/9)

Code
# Four-parameter logistic (sigmoid) curve:
#   d = lower asymptote, a = rise above d, b = steepness, c = inflection point
income_function <- function(x, a, b, c, d) {
  d + a / (1 + exp(-b * (x - c)))
}

# Starting values for the nonlinear least-squares fit
a <- 60
b <- 0.5
c <- 16
d <- 20

# Fit the sigmoid model to Income1 by nonlinear least squares
fit <- nls(Income ~ income_function(Education, a, b, c, d), data = income_1,
           start = list(a = a, b = b, c = c, d = d))

coef_fit <- coef(fit)
# coef_fit

# Attach fitted values and residuals (.fitted, .resid) to the data
income_1_augment <- augment(fit, income_1)

# Fitted curve with vertical residual segments at each observation
p1 <- income_1_augment |>
  ggplot(aes(x = Education, y = Income)) +
  geom_segment(aes(xend = Education, yend = .fitted), color = "black", linetype = "solid",  linewidth = 0.5) +
  geom_point(color = "red", shape = "circle") +
  geom_function(fun = function(x) income_function(x, 
                                                  a = coef_fit["a"],
                                                  b = coef_fit["b"],
                                                  c = coef_fit["c"],
                                                  d = coef_fit["d"]), 
                color = "blue", linewidth = 1) +
  labs(x = "Years of Education", y = "Income") +
  scale_y_continuous(breaks = seq(0, 100, 10)) +
  scale_x_continuous(limits = c(10, 22), breaks = seq(10, 22, 2)) +
  theme_light()

# Residual plot on the same x scale
p2 <- income_1_augment |>
  ggplot(aes(x = Education, y = .resid)) +
  geom_hline(yintercept = 0, color = "black", linetype = "dashed") +
  geom_point(color = "red", shape = "circle") +
  labs(x = "Years of Education", y = "Residuals") +
  scale_y_continuous(limits = c(-10, 10), breaks = seq(-10, 10, 5)) +
  scale_x_continuous(limits = c(10, 22), breaks = seq(10, 22, 2)) +
  theme_light()

# Stack the fit over the residuals (3:1 height ratio), collecting shared axes
p1 / p2 +
  plot_layout(heights = c(3, 1), axes = "collect")

What is Statistical Learning? (5/9)

Code
# Income2: keep Education, Seniority, Income (drop the row-number column).
# col_select = c(2:4) keeps THREE columns, so col_types needs three specs:
# "ddd". The original "dd" declared only two types for three selected columns.
income_2 <- read_csv("datasets/Income2.csv", col_names = TRUE, col_select = c(2:4), 
                     col_types = "ddd")

income_2

What is Statistical Learning? (6/9)

What is Statistical Learning? (7/9)

Code
# Income vs Seniority with a straight-line fit; point color encodes Education
income_2 |> 
  ggplot(aes(x = Seniority, y = Income)) +
  geom_smooth(method = "lm", se = FALSE, formula = y ~ x) +
  geom_point(aes(color = Education))

What is Statistical Learning? (8/9)

Code
# fit the income data to a function that is sigmoid shape in Education and linear in Seniority

# Sigmoid in x (Education) plus a linear term in y (Seniority):
# d + a / (1 + exp(-b (x - c))) + e * y
income_function_2 <- function(x, y, a, b, c, d, e) {
  sigmoid_part <- a / (1 + exp(-b * (x - c)))
  d + sigmoid_part + e * y
}

# Starting values for the nonlinear least-squares fit
a <- 60
b <- 0.5
c <- 16
d <- 20
e <- 0.5

# Fit the sigmoid-in-Education plus linear-in-Seniority model
fit_2 <- nls(Income ~ income_function_2(Education, Seniority, a, b, c, d, e), data = income_2,
           start = list(a = a, b = b, c = c, d = d, e = e))

# summary(fit_2)
# coef_fit_2 <- coef(fit_2)
# coef_fit_2

# Attach fitted values and residuals (.fitted, .resid) to the data
income_2_augment <- augment(fit_2, income_2)

## plot predicted values against actual values

# Predicted vs actual; the dashed line marks perfect prediction
p1 <- income_2_augment |>
  ggplot(aes(x = Income, y = .fitted)) +
  geom_point(color = "red", shape = "circle") +
  geom_abline(intercept = 0, slope = 1, color = "blue", linetype = "dashed") +
  labs(x = "Actual Income", y = "Predicted Income") +
  scale_x_continuous(breaks = seq(0, 100, 10)) +
  scale_y_continuous(breaks = seq(0, 100, 10)) +
  theme_light()

# Residuals vs actual income
p2 <- income_2_augment |>
  ggplot(aes(x = Income, y = .resid)) +
  geom_hline(yintercept = 0, color = "black", linetype = "dashed") +
  geom_point(color = "red", shape = "circle") +
  labs(x = "Actual Income", y = "Residuals") +
  scale_x_continuous(breaks = seq(0, 100, 10)) +
  scale_y_continuous(limits = c(-15, 15), breaks = seq(-10, 10, 5)) +
  theme_light()

# Stack with 3:1 heights, collecting shared axes
p1 / p2 +
  plot_layout(heights = c(3, 1), axes = "collect")


\[ \textbf{Income} = \frac{a}{1 + \exp(-b(\textbf{Education} - c))} + d + e (\textbf{Seniority}) \]

What is Statistical Learning? (9/9)

Code
# Predict Income from the fitted sigmoid-plus-linear model (fit_2).
# Vectorized over education and seniority.
predict_income <- function(education, seniority) {
  # Name the local `cf`, not `coef`: a local named `coef` shadows stats::coef()
  cf <- coef(fit_2)
  cf['d'] + cf['a'] / (1 + exp(-cf['b'] * (education - cf['c']))) + cf['e'] * seniority
}

# Create sequences and grid covering the observed predictor ranges
education_seq <- seq(min(income_2$Education), max(income_2$Education), length.out = 100)
seniority_seq <- seq(min(income_2$Seniority), max(income_2$Seniority), length.out = 100)
grid <- expand.grid(Education = education_seq, Seniority = seniority_seq)
grid$Income <- predict_income(grid$Education, grid$Seniority)


# 3D scatter: model surface sampled on the grid, plus observed data in red.
# Native |> pipe used for consistency with the rest of the document.
p3 <- plotly::plot_ly(width = 500, height = 500) |>
  plotly::add_markers(data = grid, x = ~Education, y = ~Seniority, z = ~Income, 
              marker = list(size = 3, color = ~Income, colorscale = "Viridis"),
              showlegend = FALSE) |>
  plotly::add_markers(data = income_2, x = ~Education, y = ~Seniority, z = ~Income,
              marker = list(size = 5, color = "red", symbol = "circle"),
              showlegend = FALSE) |>
  plotly::layout(scene = list(
    xaxis = list(title = "Education"),
    yaxis = list(title = "Seniority"),
    zaxis = list(title = "Income")
  ),
  title = "3D Income Scatter")


# Static 3D surface plot (kept for reference; writes persp_plot.png)
# png("persp_plot.png", width = 800, height = 600)
# z_matrix <- matrix(grid$Income, nrow = length(education_seq), ncol = length(seniority_seq))
# persp(education_seq, seniority_seq, z_matrix, 
#       theta = 30, phi = 30, expand = 0.5, col = "lightblue",
#       xlab = "Education", ylab = "Seniority", zlab = "Income",
#       main = "3D Surface: Income vs Education and Seniority")
# points <- trans3d(income_2$Education, income_2$Seniority, income_2$Income, 
#                   pmat = persp(education_seq, seniority_seq, z_matrix, theta = 30, phi = 30, expand = 0.5, col = "lightblue"))
# points(points, col = "red", pch = 16)
# dev.off()

# Display the interactive 3D scatter plot
p3

Assessing Model Accuracy: Regression (1/6)

\[ MSE = \frac{1}{n} \sum_{i=1}^{n} \left (y_i - \hat{f}(x_i) \right )^2 \]

Where \(y_i\) is the true value of the response for observation \(i\), and \(\hat{f}(x_i)\) is the predicted value.

Assessing Model Accuracy: Regression (2/6)

Create a dummy dataset with 100 observations that follow a polynomial function

Code
# Simulate n points from y = x^4 plus Gaussian noise (sd = 2)
set.seed(142)
n <- 100
x <- runif(n, -2, 2)
y <- x^4 + rnorm(n, 0, 2)

poly_data <- tibble(x = x, y = y)

# Scatter of the simulated data
p1 <- ggplot(poly_data, aes(x, y)) +
  geom_point(color = "red", shape = "circle open", stroke = 1) +
  labs(x = "x", y = "y") +
  theme_light()

p1

Assessing Model Accuracy: Regression (3/6)

Split the data into training and testing sets

Code
# Reuse the seed so the split is reproducible
set.seed(142)
# Row indices for an 80/20 train/test split
train_index <- sample(1:n, n * 0.8)
# train_data <- poly_data[train_index, ]
# test_data <- poly_data[-train_index, ]

# Label each row rather than physically splitting the data
poly_data_sample <- poly_data |> 
  mutate(
    sample = ifelse(row_number() %in% train_index, "Train", "Test")
  )

# poly_data_sample

# Plot the split, distinguishing Train/Test by color and shape
p1 <- poly_data_sample |> 
  ggplot(aes(x = x, y = y)) +
  geom_point(aes(color = sample, shape = sample), size = 2) +
  scale_color_colorblind() +
  labs(x = "x", y = "y") +
  theme_light()

p1

Assessing Model Accuracy: Regression (4/6)

Fit a polynomial regression model to the training data

Code
# Fit a cubic polynomial (orthogonal basis) to the training rows only
poly_fit_train <- lm(y ~ poly(x, degree = 3), data = filter(poly_data_sample, sample == "Train"))
summary(poly_fit_train)

Call:
lm(formula = y ~ poly(x, degree = 3), data = filter(poly_data_sample, 
    sample == "Train"))

Residuals:
    Min      1Q  Median      3Q     Max 
-3.9110 -1.3845 -0.0798  1.2008  8.4279 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)            3.0068     0.2453  12.257   <2e-16 ***
poly(x, degree = 3)1   4.0329     2.1942   1.838    0.070 .  
poly(x, degree = 3)2  32.5005     2.1942  14.812   <2e-16 ***
poly(x, degree = 3)3   2.5805     2.1942   1.176    0.243    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.194 on 76 degrees of freedom
Multiple R-squared:  0.7468,    Adjusted R-squared:  0.7368 
F-statistic: 74.72 on 3 and 76 DF,  p-value: < 2.2e-16

Plot the polynomial regression model

Code
# Attach fitted values to the training rows for plotting
poly_data_augment <- augment(poly_fit_train, filter(poly_data_sample, sample == "Train"))

# Training data with the fitted cubic overlaid
p1 <- poly_data_augment |> 
  ggplot(aes(x = x, y = y)) +
  geom_point(color = "red", shape = "circle open", stroke = 1) +
  # `linewidth` replaces the deprecated `size` aesthetic for lines
  # (ggplot2 >= 3.4); the rest of this document already uses linewidth
  geom_line(aes(y = .fitted), color = "blue", linewidth = 1) +
  labs(x = "x", y = "y") +
  theme_light()

p1

Assessing Model Accuracy: Regression (5/6)

Show the fit to the test data

Code
# Overlay the training-set fit (blue line) on the held-out test points
p1 <- poly_data_augment |> 
  ggplot(aes(x = x, y = y)) +
  geom_point(data = filter(poly_data_sample, sample == "Test"), 
             color = "red", shape = "circle open", stroke = 1) +
  # `linewidth` replaces the deprecated `size` aesthetic for lines (ggplot2 >= 3.4)
  geom_line(aes(y = .fitted), color = "blue", linewidth = 1) +
  labs(x = "x", y = "y") +
  theme_light()

p1

Assessing Model Accuracy: Regression (6/6)

Calculate MSE for the training and test data as a function of polynomial degree

Code
# Track training and test MSE as the polynomial degree grows (1..10)

# Preallocate result vectors, one slot per degree
mse_train <- numeric(10)
mse_test <- numeric(10)

for (i in 1:10) {
  # Fit a degree-i polynomial (raw basis) on the training data only
  poly_fit_train <- lm(y ~ poly(x, degree = i, raw = TRUE), 
                       data = filter(poly_data_sample, sample == "Train"))
  
  # Training MSE from the residual column (.resid = y - .fitted)
  train_augment <- augment(poly_fit_train, newdata = filter(poly_data_sample, sample == "Train"))
  mse_train[i] <- mean(train_augment$.resid^2)
  
  # Test MSE computed the same way (.resid), for consistency with the
  # training MSE above; the original mixed .resid and y - .fitted
  test_augment <- augment(poly_fit_train, newdata = filter(poly_data_sample, sample == "Test"))
  mse_test[i] <- mean(test_augment$.resid^2)
}

# Collect results for plotting
mse_results <- data.frame(
  degree = 1:10,
  train_mse = mse_train,
  test_mse = mse_test
)

# Plot MSE vs polynomial degree; loess smooths trace the trend, points
# mark the actual MSE values at each degree
ggplot(mse_results, aes(x = degree)) +
  geom_smooth(aes(y = train_mse, color = "Training"), se = FALSE, formula = y ~ x, method = "loess") +
  geom_smooth(aes(y = test_mse, color = "Test"), se = FALSE, formula = y ~ x, method = "loess") +
  geom_point(aes(y = train_mse, color = "Training")) +
  geom_point(aes(y = test_mse, color = "Test")) +
  scale_color_manual(values = c("Training" = "blue", "Test" = "red")) +
  labs(title = "MSE vs Polynomial Degree",
       x = "Polynomial Degree",
       y = "Mean Squared Error",
       color = "Dataset") +
  theme_light() +
  scale_y_continuous(expand = c(0, 0), limits = c(0, NA)) +
  scale_x_continuous(limits = c(1, 10), breaks = seq(0, 10, 1))

Assessing Model Accuracy: Classification (1/2)

Code
# Two interleaving half-circles ("moons"), mirroring sklearn's make_moons.
# Returns a list with X (n_samples x 2 matrix) and y (0/1 class labels).
# The matrix columns are named after the outer arc (outer_circ_x/_y),
# which is what downstream plotting code references.
make_moons <- function(n_samples = 100, noise = 0.05, random_state = NULL) {
  if (!is.null(random_state)) {
    set.seed(random_state)
  }

  n_outer <- ceiling(n_samples / 2)
  n_inner <- floor(n_samples / 2)

  theta_outer <- seq(0, pi, length.out = n_outer)
  theta_inner <- seq(0, pi, length.out = n_inner)

  # Upper arc, then the lower arc shifted right and down
  X <- rbind(
    cbind(outer_circ_x = cos(theta_outer), outer_circ_y = sin(theta_outer)),
    cbind(1 - cos(theta_inner), 1 - sin(theta_inner) - 0.5)
  )

  # Class 0 for the outer arc, class 1 for the inner arc
  y <- rep(c(0, 1), times = c(n_outer, n_inner))

  # Optional Gaussian jitter on both coordinates
  if (noise > 0) {
    X <- X + matrix(rnorm(n_samples * 2, sd = noise), ncol = 2)
  }

  list(X = X, y = y)
}
Code
# Generate moon-shaped data
set.seed(42)  # For reproducibility
result <- make_moons(n_samples = 200, noise = 0.3, random_state = 42)

# Extract features and labels
X <- result$X
y <- result$y

# Column names (outer_circ_x, outer_circ_y) come from the first cbind()
# inside make_moons and label both arcs' coordinates
df <- data.frame(X, Class = as.factor(y))

ggplot(df, aes(x = outer_circ_x, y = outer_circ_y)) +
  geom_point(aes(color = Class, shape = Class), size = 2) +
  labs(
    x = "X",
    y = "Y") +
  scale_color_colorblind() +
  theme_light() 

Assessing Model Accuracy: Classification (2/2)

Fit the data to a KNN model

Accuracy

Code
# Split the data into training (80%) and testing (20%) sets
train_index <- sample(1:nrow(df), nrow(df) * 0.8)
train_data <- df[train_index, ]
test_data <- df[-train_index, ]

# Fit the KNN model (k = 5); class::knn fits and predicts in one call
knn_fit <- class::knn(train = train_data[, 1:2], test = test_data[, 1:2], cl = train_data$Class, k = 5)

# Accuracy: share of test points whose predicted class matches the actual
accuracy <- mean(knn_fit == test_data$Class)
accuracy
[1] 0.925



Confusion Matrix

Code
# Confusion matrix: rows = predicted class, columns = actual class
confusion_matrix <- table(Predicted = knn_fit, Actual = test_data$Class)
confusion_matrix
         Actual
Predicted  0  1
        0 21  2
        1  1 16

Linear Regression

Simple Linear Regression (1/9)

Linear relationship

\[ Y \approx \beta_0 + \beta_1 X \]


For the Advertising data, we might have:

\[ \textbf{Sales} \approx \beta_0 + \beta_1 \times \textbf{TV} \]

Where \(\beta_0\) is the intercept and \(\beta_1\) is the slope. These are unknown constants.

Once we estimate the coefficients \(\beta_0\) and \(\beta_1\), we can make predictions.

\[ \hat{Y} = \hat{\beta}_0 + \hat{\beta}_1 X \]

Simple Linear Regression (2/9)

Review the Advertising data

Code
# Fit Sales ~ TV and visualize each observation's residual as a vertical
# segment from the point to the fitted line
advertising_fit <- lm(Sales ~ TV, data = advertising)

advertising_augment <- augment(advertising_fit, advertising)

advertising_augment |> 
  ggplot(aes(TV, Sales)) + 
  geom_segment(aes(xend = TV, yend = .fitted), color = "black", linetype = "solid", linewidth = 0.25) +
  geom_point(color = "red", shape = "circle") +
  # `linewidth` replaces the deprecated `size` aesthetic for lines (ggplot2 >= 3.4)
  geom_smooth(formula = y ~ x, method = "lm", se = FALSE, color = "blue", linewidth = 1) +
  scale_y_continuous(breaks = seq(0, 30, 5)) +
  theme_light()

What is our goal?

  • Find coefficients for \(\beta_0\) and \(\beta_1\)
  • Minimize the sum of squared residuals

\[ \text{RSS} = \sum_{i=1}^{n} \left (y_i - \hat{y}_i \right )^2 \]

Simple Linear Regression (3/9)

Least Squares

The least squares approach chooses \(\hat{\beta}_0\) and \(\hat{\beta}_1\) to minimize the RSS.

\[ RSS = \sum_{i=1}^{n} \left (y_i - (\hat{\beta}_0 + \hat{\beta}_1 x_i) \right )^2 \]

1. Take the partial derivative of the RSS with respect to \(\beta_0\) and \(\beta_1\)
2. Set the derivatives to zero
3. Solve the resulting system of two equations with two unknowns, \(\beta_0\) and \(\beta_1\)

Solution

\[ \hat{\beta}_1 = \frac{\sum_{i=1}^{n} (x_i - \bar{x})(y_i - \bar{y})}{\sum_{i=1}^{n} (x_i - \bar{x})^2} \]

\[ \hat{\beta}_0 = \bar{y} - \hat{\beta}_1 \bar{x} \]

Simple Linear Regression (4/9)

Linear model in R

Code
# Fit a simple linear regression model: Sales ~ TV
advertising_TV_fit <- lm(Sales ~ TV, data = advertising)

# Print the summary of the model (coefficients, SEs, R-squared, F-statistic)
summary(advertising_TV_fit)

Call:
lm(formula = Sales ~ TV, data = advertising)

Residuals:
    Min      1Q  Median      3Q     Max 
-8.3860 -1.9545 -0.1913  2.0671  7.2124 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 7.032594   0.457843   15.36   <2e-16 ***
TV          0.047537   0.002691   17.67   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 3.259 on 198 degrees of freedom
Multiple R-squared:  0.6119,    Adjusted R-squared:  0.6099 
F-statistic: 312.1 on 1 and 198 DF,  p-value: < 2.2e-16

Extracting the coefficients

Code
# Extract the coefficients
coef(advertising_TV_fit)
(Intercept)          TV 
 7.03259355  0.04753664 
Code
coef(advertising_TV_fit)["TV"]
        TV 
0.04753664 
Code
coef(advertising_TV_fit)["(Intercept)"]
(Intercept) 
   7.032594 

Simple Linear Regression (5/9)

Create a contour plot of the RSS as a function of \(\beta_0\) and \(\beta_1\)

Code
# Create a grid of candidate (beta_0, beta_1) pairs
beta_0 <- seq(5, 10, length.out = 100)
beta_1 <- seq(0.02, 0.08, length.out = 100)
grid <- expand.grid(beta_0 = beta_0, beta_1 = beta_1)

# RSS for each candidate pair. mapply over the two columns keeps them
# numeric vectors; the original apply() first coerces the data frame to a
# matrix, which is an R anti-pattern even when it happens to work.
grid$RSS <- mapply(
  function(b0, b1) sum((advertising$Sales - (b0 + b1 * advertising$TV))^2),
  grid$beta_0,
  grid$beta_1
)

# Contour plot of RSS (in thousands); the least-squares solution in red
ggplot(grid, aes(x = beta_0, y = beta_1, z = RSS/1000)) +
  geom_contour_filled(breaks = seq(2.1, 3, 0.05)) +
  geom_point(aes(x = coef(advertising_fit)[1], y = coef(advertising_fit)[2]), color = "red", size = 2) +
  labs(title = "RSS Contour Plot",
       x = expression(beta[0]),
       y = expression(beta[1]),
       z = "RSS") +
  theme_light()

Simple Linear Regression (6/9)

Variance of the coefficients

\[ \text{Var}(\hat{\beta}_1) = \frac{\sigma^2}{\sum_{i=1}^{n} (x_i - \bar{x})^2} \]

\[ \text{Var}(\hat{\beta}_0) = \sigma^2 \left [ \frac{1}{n} + \frac{\bar{x}^2}{\sum_{i=1}^{n} (x_i - \bar{x})^2} \right ] \]

where \(\sigma^2\) is the variance of the error term \(\epsilon\).

Assessing the accuracy of the coefficient estimates

Code
# Reproduce the coefficient table of summary(advertising_fit) by hand

# Standard errors: square roots of the diagonal of the covariance matrix
se <- sqrt(diag(vcov(advertising_fit)))

# t-statistics: each estimate divided by its standard error
t_stat <- coef(advertising_fit) / se

# Two-sided p-values from the t distribution with n - 2 degrees of freedom
p_value <- 2 * pt(abs(t_stat), df = nrow(advertising) - 2, lower.tail = FALSE)

# Create a data frame with the results
results <- data.frame(
  term = names(coef(advertising_fit)),
  estimate = coef(advertising_fit),
  std_error = se,
  t_statistic = t_stat,
  p_value = p_value
)

# Print the results
results
Code
summary(advertising_fit)

Call:
lm(formula = Sales ~ TV, data = advertising)

Residuals:
    Min      1Q  Median      3Q     Max 
-8.3860 -1.9545 -0.1913  2.0671  7.2124 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 7.032594   0.457843   15.36   <2e-16 ***
TV          0.047537   0.002691   17.67   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 3.259 on 198 degrees of freedom
Multiple R-squared:  0.6119,    Adjusted R-squared:  0.6099 
F-statistic: 312.1 on 1 and 198 DF,  p-value: < 2.2e-16

Simple Linear Regression (7/9)

Confidence intervals for the coefficients

Code
# Calculate the confidence intervals for the coefficients (stats package)
conf_int <- confint(advertising_fit)

# Create a data frame with the results (columns 1/2 are lower/upper bounds)
results <- tibble(
  term = names(coef(advertising_fit)),
  estimate = coef(advertising_fit),
  lower = conf_int[, 1],
  upper = conf_int[, 2]
)

# Print the results
results

Simple Linear Regression (8/9)

Plot the data
with confidence interval

Code
# Scatter with the least-squares line and its confidence band (se = TRUE)
advertising_augment |> 
  ggplot(aes(x = TV, y = Sales)) +
  geom_point(color = "red", shape = "circle open", stroke = 1) +
  # `linewidth` replaces the deprecated `size` aesthetic for lines (ggplot2 >= 3.4)
  geom_smooth(formula = y ~ x, method = "lm", se = TRUE, color = "blue", linewidth = 1) +
  labs(x = "TV", y = "Sales",
       title = "Scatter Plot of Advertising Data",
       subtitle = "se = TRUE") +
  scale_y_continuous(breaks = seq(0, 30, 5), limits = c(0, 30)) +
  theme_light()

Plot the data with
confidence and prediction interval

Code
# Confidence band (shaded, from geom_smooth) plus prediction-interval
# bounds (dashed lines, from predict(..., interval = "predict"))
advertising_augment |> 
  ggplot(aes(x = TV, y = Sales)) +
  geom_point(color = "red", shape = "circle open", stroke = 1) +
  # `linewidth` replaces the deprecated `size` aesthetic for lines (ggplot2 >= 3.4)
  geom_smooth(formula = y ~ x, method = "lm", se = TRUE, color = "blue", linewidth = 1) +
  # Lower and upper prediction-interval bounds evaluated along the TV axis
  geom_function(fun = function(x) predict(advertising_fit, newdata = data.frame(TV = x), interval = "predict")[, "lwr"],
                color = "black", linetype = "dashed") +
  geom_function(fun = function(x) predict(advertising_fit, newdata = data.frame(TV = x), interval = "predict")[, "upr"],
                color = "black", linetype = "dashed") +
  labs(x = "TV", y = "Sales",
       title = "Scatter Plot of Advertising Data",
       subtitle = "Demonstration of confidence and prediction intervals") +
  scale_y_continuous(breaks = seq(0, 30, 5), limits = c(0, 30)) +
  theme_light()

Simple Linear Regression (9/9)

Assessing the accuracy of the model using the \(R^2\) statistic

\[ R^2 = \frac{\text{Explained Variance}}{\text{Total Variance}} \]


\[ R^2 = \frac{TSS - RSS}{TSS} \quad \text{or} \quad R^2 = 1 - \frac{RSS}{TSS} \]

where TSS is the total sum of squares and RSS is the residual sum of squares.

\[ TSS = \sum_{i=1}^{n} (y_i - \bar{y})^2 \quad \text{and} \quad RSS = \sum_{i=1}^{n} (y_i - \hat{y}_i)^2 \]

\(R^2\) is the proportion of the variance in the response variable that is predictable from the predictor variable and takes values between 0 and 1.

Simple Linear Regression: Adjusted \(R^2\)

\[ \text{Adjusted } R^2 = 1 - \frac{RSS / (n - d -1)}{TSS / (n - 1)} \] where \(n\) is the number of observations and \(d\) is the number of predictors in the model.

Multiple Linear Regression (1/7)

Advertising Data

Code
advertising

Multiple Linear Regression (2/7)

Plot the Advertising Data

Code
# Helper: Sales vs one advertising channel with an OLS line.
# NOTE(review): the original panels added scale_color_viridis_c(), but no
# colour aesthetic is mapped anywhere, so that scale had no effect and is
# dropped here.
plot_channel <- function(var) {
  advertising |>
    ggplot() +
    geom_point(aes(x = .data[[var]], y = Sales), shape = "circle open", color = "red", size = 2, stroke = 1) +
    # `linewidth` replaces the deprecated `size` aesthetic for lines (ggplot2 >= 3.4)
    geom_smooth(aes(x = .data[[var]], y = Sales), formula = y ~ x, method = "lm", se = FALSE, color = "blue", linewidth = 1) +
    labs(x = var, y = "Sales") +
    theme_light()
}

p1 <- plot_channel("TV")
p2 <- plot_channel("Radio")
p3 <- plot_channel("Newspaper")

# patchwork: three panels side by side
p1 + p2 + p3

Multiple Linear Regression (3/7)

Fit a multiple linear regression model

Code
# Fit a multiple linear regression model with all three media budgets
advertising_all_fit <- lm(Sales ~ TV + Radio + Newspaper, data = advertising)

# Print the summary of the model
summary(advertising_all_fit)

Call:
lm(formula = Sales ~ TV + Radio + Newspaper, data = advertising)

Residuals:
    Min      1Q  Median      3Q     Max 
-8.8277 -0.8908  0.2418  1.1893  2.8292 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  2.938889   0.311908   9.422   <2e-16 ***
TV           0.045765   0.001395  32.809   <2e-16 ***
Radio        0.188530   0.008611  21.893   <2e-16 ***
Newspaper   -0.001037   0.005871  -0.177     0.86    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.686 on 196 degrees of freedom
Multiple R-squared:  0.8972,    Adjusted R-squared:  0.8956 
F-statistic: 570.3 on 3 and 196 DF,  p-value: < 2.2e-16

Multiple Linear Regression (4/7)

Plot predicted values against actual values

Code
# Attach fitted values and residuals from the full additive model
advertising_all_augment <- augment(advertising_all_fit, advertising)

# Predicted vs actual; the dashed line marks perfect prediction
p1 <- advertising_all_augment |> 
  ggplot(aes(x = Sales, y = .fitted)) +
  geom_point(color = "red", shape = "circle") +
  geom_abline(intercept = 0, slope = 1, color = "blue", linetype = "dashed") +
  labs(x = "Actual Sales", y = "Predicted Sales") +
  scale_x_continuous(breaks = seq(0, 30, 5)) +
  scale_y_continuous(breaks = seq(0, 30, 5)) +
  theme_light()

# Residuals vs actual sales
p2 <- advertising_all_augment |>
  ggplot(aes(x = Sales, y = .resid)) +
  geom_hline(yintercept = 0, color = "black", linetype = "dashed") +
  geom_point(color = "red", shape = "circle") +
  labs(x = "Actual Sales", y = "Residuals") +
  scale_x_continuous(breaks = seq(0, 30, 5)) +
  scale_y_continuous(limits = c(-10, 10), breaks = seq(-10, 10, 5)) +
  theme_light()

# Stack with 3:1 heights, collecting shared axes
p1 / p2 +
  plot_layout(heights = c(3, 1), axes = "collect")

Multiple Linear Regression (5/7)

Interaction between features

We can consider the interaction between features in the model. Mathematically, this would be equivalent to adding a new feature that is the product of two (or more) features.

For example, we can create a new column in the dataset that is the product of the TV and Radio columns, TV * Radio; TV and Newspaper, TV * Newspaper; and Radio and Newspaper, Radio * Newspaper.

We can also consider the product of all three columns, TV * Radio * Newspaper.

\[ \begin{multline} \text{Sales} \approx \beta_0 + \beta_1 \times \text{TV} + \beta_2 \times \text{Radio} + \beta_3 \times \text{Newspaper} \\+ \beta_4 \times \text{TV} \times \text{Radio} + \beta_5 \times \text{TV} \times \text{Newspaper} + \beta_6 \times \text{Radio} \times \text{Newspaper} \\+ \beta_7 \times \text{TV} \times \text{Radio} \times \text{Newspaper} \end{multline} \]

Multiple Linear Regression (6/7)

Revisit the Sales vs all markets

Code
# Fit a multiple linear regression model with all interaction terms:
# TV*Radio*Newspaper expands to the three main effects plus every two-way
# and the three-way cross product (NOT a TV^2 term, as a previous comment said)
advertising_crossterms_fit <- lm(Sales ~ TV*Radio*Newspaper, data = advertising)

# Print the summary of the model
summary(advertising_crossterms_fit)

Call:
lm(formula = Sales ~ TV * Radio * Newspaper, data = advertising)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.8955 -0.3883  0.1938  0.5865  1.5240 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)         6.556e+00  4.655e-01  14.083  < 2e-16 ***
TV                  1.971e-02  2.719e-03   7.250 9.95e-12 ***
Radio               1.962e-02  1.639e-02   1.197    0.233    
Newspaper           1.311e-02  1.721e-02   0.761    0.447    
TV:Radio            1.162e-03  9.753e-05  11.909  < 2e-16 ***
TV:Newspaper       -5.545e-05  9.326e-05  -0.595    0.553    
Radio:Newspaper     9.063e-06  4.831e-04   0.019    0.985    
TV:Radio:Newspaper -7.610e-07  2.700e-06  -0.282    0.778    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.9406 on 192 degrees of freedom
Multiple R-squared:  0.9686,    Adjusted R-squared:  0.9675 
F-statistic: 847.3 on 7 and 192 DF,  p-value: < 2.2e-16

Multiple Linear Regression (7/7)

Plot of the Sales vs all markets

Code
# Attach fitted values and residuals from the interaction model
advertising_crossterms_augment <- augment(advertising_crossterms_fit, advertising)

# Predicted vs actual; the dashed line marks perfect prediction
p1 <- advertising_crossterms_augment |> 
  ggplot(aes(x = Sales, y = .fitted)) +
  geom_point(color = "red", shape = "circle") +
  geom_abline(intercept = 0, slope = 1, color = "blue", linetype = "dashed") +
  labs(x = "Actual Sales", y = "Predicted Sales") +
  scale_x_continuous(breaks = seq(0, 30, 5)) +
  scale_y_continuous(breaks = seq(0, 30, 5)) +
  theme_light()

# Residuals vs actual sales
p2 <- advertising_crossterms_augment |>
  ggplot(aes(x = Sales, y = .resid)) +
  geom_hline(yintercept = 0, color = "black", linetype = "dashed") +
  geom_point(color = "red", shape = "circle") +
  labs(x = "Actual Sales", y = "Residuals") +
  scale_x_continuous(breaks = seq(0, 30, 5)) +
  scale_y_continuous(limits = c(-10, 10), breaks = seq(-10, 10, 5)) +
  theme_light()

# Stack with 3:1 heights, collecting shared axes
p1 / p2 +
  plot_layout(heights = c(3, 1), axes = "collect")

Assignment: Linear Regression (1/5)

Concrete Compressive Strength

UCI Machine Learning Repository

Using the Concrete_Data.csv dataset, perform a linear regression analysis to predict the compressive strength of concrete using one or more of the other features in the dataset.

What was the best Adjusted \(R^2\) value you achieved?

Code
# New names for columns (valid R identifiers instead of the long originals)
concrete_names <- c("Cement", "Blast_Furnace_Slag", "Fly_Ash", "Water", "Superplasticizer", "Coarse_Aggregate", "Fine_Aggregate", "Age", "Compressive_Strength")

# Load the dataset, skipping the original header row and applying our names
library(readxl)
concrete_data <- read_xls("datasets/concrete+compressive+strength/Concrete_Data.xls",
                          col_names = concrete_names,
                          skip = 1)

concrete_data

Assignment: Linear Regression (2/5)

EDA of dataset using ggpairs

Code
library(GGally)

# Pairwise plots of all columns; low alpha thins the dense scatter panels
concrete_data |> 
  ggpairs(lower = list(continuous = wrap("points", alpha = 1/10)))

Assignment: Linear Regression (3/5)

Fit a linear regression model

Code
# Fit a linear regression model using two hand-picked predictors
concrete_fit_a <- lm(Compressive_Strength ~ Cement + Superplasticizer, data = concrete_data)

summary(concrete_fit_a)

Call:
lm(formula = Compressive_Strength ~ Cement + Superplasticizer, 
    data = concrete_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-33.949 -10.032  -0.515   9.117  43.676 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)      9.190242   1.250302    7.35 4.03e-13 ***
Cement           0.074794   0.004036   18.53  < 2e-16 ***
Superplasticizer 0.902460   0.070603   12.78  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 13.47 on 1027 degrees of freedom
Multiple R-squared:  0.3511,    Adjusted R-squared:  0.3498 
F-statistic: 277.8 on 2 and 1027 DF,  p-value: < 2.2e-16

Assignment: Linear Regression (4/5)

Fit a linear regression model to all the features

Code
# Fit a linear regression model using every other column as a predictor
concrete_fit_b <- lm(Compressive_Strength ~ ., data = concrete_data)

summary(concrete_fit_b)

Call:
lm(formula = Compressive_Strength ~ ., data = concrete_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-28.653  -6.303   0.704   6.562  34.446 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)        -23.163756  26.588421  -0.871 0.383851    
Cement               0.119785   0.008489  14.110  < 2e-16 ***
Blast_Furnace_Slag   0.103847   0.010136  10.245  < 2e-16 ***
Fly_Ash              0.087943   0.012585   6.988 5.03e-12 ***
Water               -0.150298   0.040179  -3.741 0.000194 ***
Superplasticizer     0.290687   0.093460   3.110 0.001921 ** 
Coarse_Aggregate     0.018030   0.009394   1.919 0.055227 .  
Fine_Aggregate       0.020154   0.010703   1.883 0.059968 .  
Age                  0.114226   0.005427  21.046  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 10.4 on 1021 degrees of freedom
Multiple R-squared:  0.6155,    Adjusted R-squared:  0.6125 
F-statistic: 204.3 on 8 and 1021 DF,  p-value: < 2.2e-16

Assignment: Linear Regression (5/5)

Compare the models by plotting predicted vs actual values

Code
# Attach fitted values from each model for plotting
concrete_augment_a <- augment(concrete_fit_a, concrete_data)
concrete_augment_b <- augment(concrete_fit_b, concrete_data)

# Predicted vs actual for the two-predictor model
p1 <- concrete_augment_a |> 
  ggplot(aes(x = Compressive_Strength, y = .fitted)) +
  geom_point(color = "red", shape = "circle", alpha = 1/5) +
  geom_abline(intercept = 0, slope = 1, color = "blue", linetype = "dashed") +
  labs(x = "Actual Compressive Strength", y = "Predicted Compressive Strength") +
  scale_x_continuous(breaks = seq(0, 100, 10)) +
  scale_y_continuous(breaks = seq(0, 100, 10), limits = c(0, 80)) +
  labs(title = "Cement and Superplasticizer") +
  theme_light()

# Predicted vs actual for the all-features model
p2 <- concrete_augment_b |>
  ggplot(aes(x = Compressive_Strength, y = .fitted)) +
  geom_point(color = "red", shape = "circle", alpha = 1/5) +
  geom_abline(intercept = 0, slope = 1, color = "blue", linetype = "dashed") +
  labs(x = "Actual Compressive Strength", y = "Predicted Compressive Strength") +
  scale_x_continuous(breaks = seq(0, 100, 10)) +
  scale_y_continuous(breaks = seq(0, 100, 10), limits = c(0, 80)) +
  labs(title = "All Features") +
  theme_light()

# Side by side with shared axes
p1 + p2 +
  plot_layout(axes = "collect")

End of Module 6

References